% Linear mixed model estimation 
% modified in 2017/02/18
% Nico's cityhall dataset is added, modified version is adopted

% ver05: New data set (added more variables) 

clc; clear all; close all;

%% Path
workpath = pwd;
savepath = [workpath, filesep, 'results'];

addpath('toolbox_xt');
addpath('toolbox_plot');

%% Setting
cityname = 'all';
loaddate = '20170217';
savedate = '20170217';

%% Load Data Here
% origorder pmsafips year y x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 lat2k lon2k lati loni year_dummy* latj lonj ///
% pop2k_c hu2k_c landb_c waterb_c areab_c ch_lat ch_lon area_ua afact_ua min_dst min_dist2

dataname = [cityname, '_', loaddate, '.csv'];

% origorder, fips, year, nu_ijt
M0 = csvread(dataname, 2);

%% Outliers
% Rule 1: Getting rid of outliers
% larger than 55, then eyeball investigation
nobs_old = size(M0,1);
fips = M0(:,2);
year = M0(:,3);
ch_dist = M0(:,38); %vicinity distance from the city hall
outliers = csvread('outliers.csv');
drop_set = [];
for i=1:1:size(outliers,1)
    
    temp_M0 = M0(fips==outliers(i,1),:);
    temp_origorder = temp_M0(:,1);
    temp_ch_dist = temp_M0(:,38);
    [a,b] = sort(temp_ch_dist, 'descend');
    
    temp_origorder = temp_origorder(b,:);
    drop_set = [drop_set; temp_origorder(1:outliers(i,2),:)];
end
sel = zeros(size(M0,1),1);
for i=1:1:size(drop_set,1)
    sel = sel + (M0(:,1)==drop_set(i));
end
M0(logical(sel),:) = [];

% Rule2: Getting rid of outliers - observations
fips = M0(:,2);
year = M0(:,3);
ch_dist = M0(:,38); %vicinity distance from the city hall
sel = ch_dist>60; %60 miles
M0(sel,:) = [];

% after data selection
origorder = M0(:,1);
M = M0(:,2:end);

%% City selection: keep only fips that has larger than 1 transaction per pmsas
drop_n = 0;
fips = M0(:,2);
year = M0(:,3);
ch_dist = M0(:,38); %vicinity distance from the city hall

fips_val = unique(fips);
year_val = unique(year);
nnn = [];
for i=1:1:size(fips_val,1)
    temp_M = M0(fips==fips_val(i),:);
    temp_n = size(temp_M,1);
    nnn = [nnn; temp_n*ones(temp_n,1)];
    
    % let's forget about time for now
%     temp_year = year(fips==fips_val(i),:);
%     for j=1:1:size(year_val,1)
%         temp_n = size(temp_M(temp_year==year_val(j),:),1);
%         nnn = [nnn; temp_n*ones(temp_n,1)];
%     end
end

disp('------------------------------');
disp('Summary of sample selection');
disp('number cities we dropped');
size(unique(fips(nnn<=drop_n,:)))
M0 = M0(nnn>drop_n,:);

disp('number of total obs before outliers');
disp(nobs_old);
disp('number of total obs after outliers');
disp(size(M0,1));


%% Load name data
% dataname = ['name', '_', loaddate, '.csv'];
% name = csvread(dataname, 2); % I do this manually (import...)
load('name_20170217.mat');
name.pmsafips = combofips;
name.pmsaname = pmsa_name;
name.cmsaname = cmsa_name;
name.ntot     = n_tot;

%% Load tract information
% tract data 1: city hall distance 
% combofips ch_lat ch_lon lati loni min_dist min_dist2 pop2k pop09 hus2k landsqmi landsqmi_ua afact
tract00 = csvread('censustracts2_ch.csv',2);
% We get rid of tract centers outside of 60 miles form the city halls
nobs_old = size(tract00,1);
temp_dist = tract00(:,6);
tract00(temp_dist>60,:) = [];
disp('-------------');
disp('tract outliers');
disp('before');
disp(nobs_old);
disp('after');
disp(size(tract00,1));

% tract data 2: distance to the coast
% origorder d2c lati loni 
tract01 = csvread('censustracts2_d2c.csv',1);

tract.combofips = tract00(:,1);
tract.latch = tract00(:,2);
tract.lonch = tract00(:,3);
tract.lat = tract00(:,4);
tract.lon = tract00(:,5);
tract.min_dist = tract00(:,6);
tract.min_dist2 = tract00(:,7);
tract.min_pop2k = tract00(:,8);
tract.min_pop09 = tract00(:,9);
tract.min_hus2k = tract00(:,10);

tract.landsqmi = tract00(:,11);
tract.landsqmi_ua = tract00(:,12);
tract.afact = tract00(:,13);

% combofips level info
tract.unq = unique(tract.combofips);
tract.N   = length(tract.unq);
tract.n = zeros(tract.N,1);
for i=1:1:tract.N
    tract.n(i,1) = sum(tract.unq(i)==tract.combofips);
end

% add dist2coast data
tract.d2c = zeros(size(tract.lat));
tract.d2c_dist = zeros(size(tract.lat));
for i = 1:1:size(tract.lat,1);
    
    temp_a = (tract01(:,3)-tract.lat(i)).^2+(tract01(:,4)-tract.lon(i)).^2; %we do this becuase of the machine precision
    temp_ind = find(temp_a==min(temp_a));
    
    tract.d2c(i,1) = tract01(temp_ind,2);
    tract.d2c_dist(i,1) = min(temp_a);
    
end

%% Preliminary investigation
pmsafips = M(:,1);
pmsa_n   = size(unique(pmsafips),1); %number of unique pmsa
pmsa_val = unique(pmsafips);

pmsa_freq = zeros(pmsa_n,1);
for i=1:1:pmsa_n
    pmsa_freq(i,:) = sum(pmsafips == pmsa_val(i));
end
[pmsa_freq,sindex] = sort(pmsa_freq);
pmsa_val = pmsa_val(sindex,1);

sum(pmsa_freq<=15)
sum(20<=pmsa_freq & pmsa_freq<100)
sum(100<=pmsa_freq & pmsa_freq<500)
sum(500<=pmsa_freq & pmsa_freq<1000)
sum(1000<=pmsa_freq & pmsa_freq<inf)



%% Reorder data by number of observations
M_new = [];
for i=1:1:pmsa_n
    temp_M = M(pmsafips==pmsa_val(i),:);
    % store
    M_new = [M_new; temp_M];
end
% Build regressors with a new order
% outsheet origorder pmsafips year y x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14 
% lati loni year_dummy* latj lonj ///
% pop2k_c hu2k_c landb_c waterb_c areab_c ch_lat ch_lon area_ua afact_ua min_dist min_dist2 dist2coast ///
pmsafips     = M_new(:,1);
year         = M_new(:,2);
y            = M_new(:,3);
x1           = M_new(:,4);
x2           = M_new(:,5);
x3           = M_new(:,6);
x4           = M_new(:,7);
x5           = M_new(:,8);
x6           = M_new(:,9);
x7           = M_new(:,10);
x8           = M_new(:,11);
x9           = M_new(:,12);
x10          = M_new(:,13);
x11          = M_new(:,14);
x12          = M_new(:,15);
x13          = M_new(:,16);
x14          = M_new(:,17); %size

lati         = M_new(:,18);
loni         = M_new(:,19);
year_dummy1  = M_new(:,20);
year_dummy2  = M_new(:,21);
year_dummy3  = M_new(:,22);
year_dummy4  = M_new(:,23);
year_dummy5  = M_new(:,24);
year_dummy6  = M_new(:,25);
latj         = M_new(:,26); %city center latitude
lonj         = M_new(:,27); %city center longitude
pop2k_c      = M_new(:,28); %area2 (land only)
hu2k_c       = M_new(:,29); %area3 (land+water)

area_l = M_new(:,30); %land area landb_c 
area_w = M_new(:,31); %water area waterb_c 
area_t = M_new(:,32); %land+water areab_C

latch = M_new(:,33); %city-hall lat
lonch = M_new(:,34); %city-hall lon

area_ua  = M_new(:,35); % urban area
afact_ua = M_new(:,36); % urban area-afact

ch_dist = M_new(:,37); %vicinity distance from the city hall
ch_dist2 = M_new(:,38); %our distance from the city hall

dist2coast = M_new(:,39); %distance from the coastal line

fullx = [x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14];
year_val = unique(year);

%% Get area information for each pmsa
area     = [];
area_j   = [];
area_j_l = [];
area_j_w = [];
area_j_t = [];
area_j_ua = [];

Xijt = [];
Yijt = [];
Aijt = [];

Xijt_l = [];
Yijt_l = [];
Aijt_l = [];

Xijt_t = [];
Yijt_t = [];
Aijt_t = [];

Xijt_ua = [];
Yijt_ua = [];
Aijt_ua = [];

Xijt_un = [];
Yijt_un = [];
Aijt_un = [];

U_area = [];
L_area = [];

pmsa_ch.loc = cell(pmsa_n,1); %city hall locations
pmsa_ch.combofips = zeros(pmsa_n,1); %city hall locations

for i=1:1:pmsa_n
    
    % store cityhall location    
    temp_latch  = latch(pmsafips==pmsa_val(i),:);
    temp_lonch  = lonch(pmsafips==pmsa_val(i),:);
    
    [unq_latch,ia,ic] = unique(temp_latch);
    pmsa_ch.loc{i} = [unq_latch, temp_lonch(ia)];
    pmsa_ch.combofips(i) = pmsa_val(i);
    temp_ch_n = length(unq_latch);
    
    
    % Other information, area normalized by number of city halls
    temp_area_l = area_l(pmsafips==pmsa_val(i),:)/temp_ch_n;
    temp_area_w = area_w(pmsafips==pmsa_val(i),:)/temp_ch_n;
    temp_area_t = area_t(pmsafips==pmsa_val(i),:)/temp_ch_n;
    temp_area_ua = area_ua(pmsafips==pmsa_val(i),:)/temp_ch_n;
    
%     temp_area_l = area_l(pmsafips==pmsa_val(i),:);
%     temp_area_w = area_w(pmsafips==pmsa_val(i),:);
%     temp_area_t = area_t(pmsafips==pmsa_val(i),:);
    
    temp_lati  = lati(pmsafips==pmsa_val(i),:);
    temp_loni  = loni(pmsafips==pmsa_val(i),:);

    
    % Euclidean space
    tempYij = 69.172*(temp_lati - temp_latch);
    tempXij = 69.172*cos((temp_lati + temp_latch)/2).*(temp_loni - temp_lonch);
    tempZij = sqrt(tempXij.^2 + tempYij.^2);
    
    % get area
    tempX = max(tempXij)-min(tempXij);
    tempY = max(tempYij)-min(tempYij);
    tempA = (tempX^2+tempY^2)/temp_ch_n;
    
    % normalized location
    Xijt = [Xijt; tempXij/sqrt(tempA)];
    Yijt = [Yijt; tempYij/sqrt(tempA)];
    
    % un-normalized location
    Xijt_un = [Xijt_un; tempXij];
    Yijt_un = [Yijt_un; tempYij];
    
    % angle
    temp_angle = atan2(tempYij/sqrt(tempA),tempXij/sqrt(tempA));
    Aijt = [Aijt; temp_angle];
    
    % different normalization - by urban
    Xijt_ua = [Xijt_ua; tempXij./sqrt(temp_area_ua)];
    Yijt_ua = [Yijt_ua; tempYij./sqrt(temp_area_ua)];
	temp_angle = atan2(tempYij./sqrt(temp_area_ua),tempXij./sqrt(temp_area_ua));
    Aijt_ua = [Aijt_ua; temp_angle];

    U_area = [U_area; temp_area_ua];
    
    % different normalization - by land area
    Xijt_l = [Xijt_l; tempXij./sqrt(temp_area_l)];
    Yijt_l = [Yijt_l; tempYij./sqrt(temp_area_l)];
	temp_angle = atan2(tempYij./sqrt(temp_area_l),tempXij./sqrt(temp_area_l));
    Aijt_l = [Aijt_l; temp_angle];

    L_area = [L_area; temp_area_l];
    
    % different normalization
    Xijt_t = [Xijt_t; tempXij./sqrt(temp_area_t)];
    Yijt_t = [Yijt_t; tempYij./sqrt(temp_area_t)];
	temp_angle = atan2(tempYij./sqrt(temp_area_t),tempXij./sqrt(temp_area_t));
    Aijt_t = [Aijt_t; temp_angle];
    
    % store
    area = [area; tempA*ones(size(temp_lati))];
    area_j = [area_j; tempA];
    
    area_j_l = [area_j_l; temp_area_l(1,:)];
    area_j_w = [area_j_w; temp_area_w(1,:)];
    area_j_t = [area_j_t; temp_area_t(1,:)];
    area_j_ua = [area_j_ua; temp_area_ua(1,:)];
    
end

Zijt_ua = sqrt(Xijt_ua.^2 + Yijt_ua.^2);
Zijt_un = sqrt(Xijt_un.^2 + Yijt_un.^2);

%% Data 

% regressors
fix_year = 0; %fix year at 2005, 0 means include all
if fix_year ==0
    sel = year == year;
else
    sel = year == fix_year;
end
temp_y     = y(sel,:);
temp_fullx = fullx(sel,:);
temp_z     = Zijt_un(sel,:);
temp_z1     = Zijt_ua(sel,:);
temp_pmsafips = pmsafips(sel,:);
temp_area_ua  = area_ua(sel,:);
temp_area_l   = area_l(sel,:);
temp_area_ua2 = U_area(sel,:);
temp_area_l2  = L_area(sel,:);
temp_dist2coast = dist2coast(sel,:);

temp_Xijt_un = Xijt_un(sel,:);
temp_Yijt_un = Yijt_un(sel,:);
temp_ch_dist = ch_dist(sel,:);
temp_ch_dist2 = ch_dist2(sel,:);

temp_lati = lati(sel,:);
temp_loni = loni(sel,:);

temp_year = year(sel,:);

temp_year_dummy1 = year_dummy1(sel,:);
temp_year_dummy2 = year_dummy2(sel,:);
temp_year_dummy3 = year_dummy3(sel,:);
temp_year_dummy4 = year_dummy4(sel,:);
temp_year_dummy5 = year_dummy5(sel,:);
temp_year_dummy6 = year_dummy6(sel,:);

temp_latch = latch(sel,:);
temp_lonch = lonch(sel,:);


YY = temp_y; % log price
XX = [temp_fullx];  %lot-level regressors
DD = [(temp_ch_dist)]; %distance, coefficient is heterogenous over cities
logDD = log(DD);
DC = temp_dist2coast; %disttance from the coast
logDC = log(temp_dist2coast); %disttance from the coast

ZZ = [ones(size(XX,1),1), log(temp_area_ua)]; %urbanized area, city characteristics in prior
xtflag = temp_pmsafips;
xtval  = unique(xtflag);

% order the data by xtval
YY    = xt_set(YY,xtflag,xtval);
XX    = xt_set(XX,xtflag,xtval);
DD    = xt_set(DD,xtflag,xtval);
logDD = xt_set(logDD,xtflag,xtval);
log1DD = log(1+DD);

DC    = xt_set(DC,xtflag,xtval);
logDC = xt_set(logDC,xtflag,xtval);

ZZ    = xt_set(ZZ,xtflag,xtval);

Xijt_un = xt_set(temp_Xijt_un,xtflag,xtval);
Yijt_un = xt_set(temp_Yijt_un,xtflag,xtval);

dist2coast = xt_set(temp_dist2coast, xtflag, xtval);

ch_dist = xt_set(temp_ch_dist,xtflag,xtval);
ch_dist2 = xt_set(temp_ch_dist2,xtflag,xtval);

lati = xt_set(temp_lati,xtflag,xtval);
loni = xt_set(temp_loni,xtflag,xtval);

year   = xt_set(temp_year,xtflag,xtval);
year_dummy1   = xt_set(temp_year_dummy1,xtflag,xtval);
year_dummy2   = xt_set(temp_year_dummy2,xtflag,xtval);
year_dummy3   = xt_set(temp_year_dummy3,xtflag,xtval);
year_dummy4   = xt_set(temp_year_dummy4,xtflag,xtval);
year_dummy5   = xt_set(temp_year_dummy5,xtflag,xtval);
year_dummy6   = xt_set(temp_year_dummy6,xtflag,xtval);

latch = xt_set(temp_latch, xtflag, xtval);
lonch = xt_set(temp_lonch, xtflag, xtval);

xtflag = xt_set(xtflag,xtflag,xtval);

[~, ZZ2] = xt_mean(ZZ(:,2), xtflag, xtval); %collapse by xtval
ZZ2 = [ones(size(ZZ2)), ZZ2];

xtflag2 = year;
xtval2 = unique(year);

%% Additional covariates

% city hall dummy 
chdum = [];
chdef.lon = cell(length(xtval),1);
chdef.lat = cell(length(xtval),1);
nch = zeros(length(xtval),1);
for j=1:1:length(xtval)
    nch(j,1) = length(unique(lonch(xtflag==xtval(j))));
    
    temp_lonch = lonch(xtflag==xtval(j));
    temp_latch = latch(xtflag==xtval(j));
    temp_unq = unique(temp_lonch);
    
    temp_dum = zeros(size(temp_lonch,1),3);
    temp_lonch2 = [];
    temp_latch2 = [];
    for i=1:1:length(temp_unq)
        temp_dum(:,i) = temp_lonch==temp_unq(i);
        
        temp_lonch2 = [temp_lonch2; unique(temp_lonch(temp_lonch==temp_unq(i)))];
        temp_latch2 = [temp_latch2; unique(temp_latch(temp_lonch==temp_unq(i)))];
    end
    chdef.lon{j} = temp_lonch2;
    chdef.lat{j} = temp_latch2;
    
    chdum = [chdum; temp_dum];
end


%% Save data 
save('data_all.mat'); 

%% Data properties
% sample statistics
disp('mean lot size')
disp(mean(exp(XX(:,14))))
disp('median lot size')
disp(median(exp(XX(:,14))))

% number of obs by time
n_year = [];
for i=1:1:length(year_val)
    n_year = [n_year; sum(year==year_val(i))];
end
disp(n_year)
disp(n_year/sum(n_year)*100)

% number of obs by combofips
n_combofips = [];
for i=1:1:length(xtval)
    n_combofips = [n_combofips; sum(xtflag == xtval(i))];
end

% mean and median of key covariates
tab_name = {'nopu', 'commercial', 'industrial', 'retail', 'singlefamily', ....
    'multifamily', 'office', 'apartment', 'holdfordevelopment', 'holdforinvestment', ...
    'mixeduse', 'medical', 'parking', 'log size', 'log dist. to coast'}';
tab_head = ['mean', 'median', 'std'];

temp_cov = [XX(:,1:13)*100, exp(XX(:,14)), dist2coast];
tab_desc = [(1:1:15)',mean(temp_cov)', median(temp_cov)', std(temp_cov)'];

[tab_name, num2cell(tab_desc)]

disp('single-family, multifamily, or apartments')
sum(mean(temp_cov(:,[5,6,8])))

disp('held for development or investment')
sum(mean(temp_cov(:,[9,10])))

disp('no purpose')
sum(mean(temp_cov(:,[1])))




%% Figures 0 - number of obs
sub_fig_nobs;

%% Figures 1 - coastal line 
sub_fig_coast;




